Matrix-factorization solution inspired by the Netflix Prize challenge winners
%load_ext autoreload
%autoreload 2
import sys; sys.path.append('../')
from pathlib import Path
import pandas as pd
import plotly.express as px
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.nn import MSELoss
from src.models.mf.model import MatrixFactorization
from src.models.mf.trainer import train_mf
from src.models.mf.util import get_dataloaders
from src.util.data import get_train_test_ratings
import src.util.metrics as metrics
from src.util.plot import Plot
from src.util.discretizer import RatingDiscretizer
# Plot helper plus input/output locations for this experiment.
plot = Plot()
RATINGS_PATH = Path('../data/ratings_small.csv')
OUTPUT_PATH = Path('../models/matrix_factorization2.pt')

# Load the raw ratings and fit label encoders so that user/movie ids map
# onto contiguous embedding indices (LabelEncoder.fit returns the encoder).
ratings = pd.read_csv(RATINGS_PATH)
user_encoder = LabelEncoder().fit(ratings['userId'].values)
movie_encoder = LabelEncoder().fit(ratings['movieId'].values)

# Hold out a test split, then build a reindexed copy of the train split
# whose id columns carry the encoded (contiguous) indices.
train_ratings, test_ratings = get_train_test_ratings(ratings)
train_ratings_reindexed = train_ratings.copy()
for id_column, encoder in (('userId', user_encoder),
                           ('movieId', movie_encoder)):
    train_ratings_reindexed[id_column] = encoder.transform(
        train_ratings_reindexed[id_column].values)

train_loader, test_loader = get_dataloaders(
    train_ratings, test_ratings, user_encoder, movie_encoder)

# Discretized ratings feed the ranking metrics computed further below.
rating_discretizer = RatingDiscretizer()
train_discretized_ratings = rating_discretizer.fit_transform(train_ratings)
test_discretized_ratings = rating_discretizer.transform(test_ratings)
# Hyperparameter sweep: grow the embedding dimension (with a matching L2
# regularization strength) and finally enable the extra MLP head; all runs
# share the same seed so results stay comparable.
results = []
hyperparams_list = [
    {'seed': 0, 'h_dim': 5, 'lr': 1e-2, 'L2_reg': [1.6, 1.6], 'MLP': False},
    {'seed': 0, 'h_dim': 20, 'lr': 1e-2, 'L2_reg': [3.0, 3.0], 'MLP': False},
    {'seed': 0, 'h_dim': 160, 'lr': 1e-2, 'L2_reg': [7.3, 7.3], 'MLP': False},
    {'seed': 0, 'h_dim': 160, 'lr': 1e-2, 'L2_reg': [7.3, 7.3], 'MLP': True},
]
for hp in hyperparams_list:
    # Fix the RNG so embedding initialization is reproducible per run.
    torch.manual_seed(hp['seed'])
    model = MatrixFactorization(
        users_dim=len(user_encoder.classes_),
        movies_dim=len(movie_encoder.classes_),
        h_dim=hp['h_dim'],
        use_mlp=hp['MLP'],
    )
    # Warm-start the embeddings from the reindexed training ratings.
    model.initialize(train_ratings_reindexed, embedding_rescaler=0.01)
    # Single tag identifying this run (e.g. for TensorBoard comments).
    run_tag = (f"MLP={hp['MLP']}"
               f"_seed{hp['seed']}"
               f"_lr={hp['lr']}"
               f"_regL2={hp['L2_reg']}"
               f"_h_dim={hp['h_dim']}")
    model_metrics = train_mf(
        model, train_loader, test_loader,
        epochs=30, lr=hp['lr'],
        regularizing_params=hp['L2_reg'],
        comment=run_tag)
    # Record the training metrics together with the hyperparameters used.
    record = dict(model_metrics)
    record.update(
        seed=hp['seed'],
        lr=hp['lr'],
        user_embed_regularizer=hp['L2_reg'][0],
        movie_embed_regularizer=hp['L2_reg'][1],
        embedding_dim=hp['h_dim'],
        extra_MLP_bias=hp['MLP'],
    )
    results.append(record)
pd.DataFrame(results)
| | RMSE | Loss | seed | lr | user_embed_regularizer | movie_embed_regularizer | embedding_dim | extra_MLP_bias |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.893369 | 0.798108 | 0 | 0.01 | 1.6 | 1.6 | 5 | False |
| 1 | 0.884281 | 0.781953 | 0 | 0.01 | 3.0 | 3.0 | 20 | False |
| 2 | 0.878033 | 0.770942 | 0 | 0.01 | 7.3 | 7.3 | 160 | False |
| 3 | 0.863952 | 0.746413 | 0 | 0.01 | 7.3 | 7.3 | 160 | True |
# save the last model (with MLP)
# Attach the fitted encoders so the saved model can map raw ids at inference.
model.set_label_encoders(user_encoder, movie_encoder)
# NOTE(review): torch.save on the model object pickles the whole instance,
# so loading requires the original class definition to be importable.
torch.save(model, OUTPUT_PATH)
# # load the saved model
# model = torch.load(OUTPUT_PATH)
# Evaluate ranking and error metrics on the held-out split. Gradients are
# never needed for evaluation, so one no_grad scope covers every call.
with torch.no_grad():
    mean_reciprocal_rank, reciprocal_ranks = metrics.mean_reciprocal_rank(
        test_discretized_ratings, model)
    mean_average_precision, average_precisions = metrics.mean_average_precision(
        test_discretized_ratings, model)
    mean_ndcg, ndcg_ranks = metrics.mean_ndcg(
        test_discretized_ratings, model)
    coverage = metrics.coverage(test_discretized_ratings, model)
    rmse = metrics.rmse(test_discretized_ratings, model)

# Summarize the evaluation (percentages for the ranking metrics).
print(f"Mean Reciprocal Rank: {(mean_reciprocal_rank * 100):.2f}%")
print(f"Mean Average Precision: {(mean_average_precision * 100):.2f}%")
print(f"Mean NDCG: {(mean_ndcg * 100):.2f}%")
print(f"Coverage: {(coverage * 100):.2f}%")
print(f"RMSE: {(rmse):.6f}")
Mean Reciprocal Rank: 71.08% Mean Average Precision: 65.97% Mean NDCG: 78.87% Coverage: 100.00% RMSE: 0.863952
def show_metric_histogram(values, title, axis_label):
    """Render a histogram (with a box-plot marginal) for one per-user
    metric distribution and return the figure."""
    figure = px.histogram(
        x=values,
        marginal="box",
        title=title,
        labels={"x": axis_label},
    )
    figure.show()
    return figure


fig = show_metric_histogram(
    reciprocal_ranks, "Reciprocal Rank Distribution", "Reciprocal Rank")
fig = show_metric_histogram(
    average_precisions, "Average Precision Distribution", "Average Precision")
fig = show_metric_histogram(
    ndcg_ranks, "NDCG Score Distribution", "NDCG Score")